template <unsigned BSize, typename Value>
struct multi_tmp{
typedef multi_tmp<BSize-1, Value> sub_type;
multi_tmp(const Value& v):value(v), sub(v) {}
Value value;
sub_type sub;
Value sum() const { return value+sub.sum(); }
};
template <typename Value>
struct multi_tmp<0, Value>{
multi_tmp(const Value& v) {}
Value sum() const { return 0; }
};
template <unsigned Index0, unsigned Max0, unsigned Index1, unsigned Max1>
struct mult_block{
typedef mult_block<Index0, Max0, Index1+1, Max1> next;
template <typename Tmp, typename Matrix>
void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){
tmp.value+=A(i+Index0, j)*B(j, k+Index1);
next()(tmp.sub, A, B, i, j, k);
}
template <typename Tmp, typename Matrix>
void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){
C(i+Index0, k+Index1)=tmp.value;
next().update(tmp.sub, C, i, k);
}
};
template <unsigned Index0, unsigned Max0, unsigned Max1>
struct mult_block<Index0, Max0, Max1, Max1>{
typedef mult_block<Index0+1, Max0, 0, Max1> next;
template <typename Tmp, typename Matrix>
void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){
tmp.value+=A(i+Index0, j)*B(j, k+Max1);
next()(tmp.sub, A, B, i, j, k);
}
template <typename Tmp, typename Matrix>
void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){
C(i+Index0, k+Max1)=tmp.value;
next().update(tmp.sub, C, i, k);
}
};
template <unsigned Max0, unsigned Max1>
struct mult_block<Max0, Max0, Max1, Max1>{
template <typename Tmp, typename Matrix>
void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){
tmp.value+=A(i, Max0, j)*B(j, k+Max1);
}
template <typename Tmp, typename Matrix>
void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){
C(i+Max0, k+Max1)=tmp.value;
}
};
template <unsigned Size0, unsigned Size1, typename Matrix>
inline void mult(const Matrix& A, const Matrix& B, Matrix& C){
using value_type=typename Matrix::value_type;
unsigned s=A.num_rows();
mult_block<0, Size0-1, 0, Size1-1> block;
for(unsigned int i=0; i<s; i+=Size0){
for(unsigned int k=0; k<s; k+=Size1){
mult_tmp<Size0*Size1, value_type> tmp(value_type(0));
for(unsigned j=0; j<s; j++) block(tmp, A, B, i ,j, k);
block.update(tmp, C, i, k);
}
}
}